Speech Emotion Recognition with CNN

Data Exploration

In [1]:
# Import libraries 
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import glob 
from sklearn.metrics import confusion_matrix
import IPython.display as ipd  # To play sound in the notebook
import os
import sys
import warnings
# ignore warnings 
# Only silence warnings when the user has not explicitly requested them
# via the -W command-line flag (sys.warnoptions would then be non-empty).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# Also hide DeprecationWarnings (librosa/keras emit many on older stacks)
warnings.filterwarnings("ignore", category=DeprecationWarning) 
In [2]:
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# Root directories of the four emotional-speech datasets (Kaggle input mounts):
# TESS (female only), RAVDESS (male+female), SAVEE (male only), CREMA-D (mixed).
TESS = "/kaggle/input/toronto-emotional-speech-set-tess/tess toronto emotional speech set data/TESS Toronto emotional speech set data/"
RAV = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/"
SAVEE = "/kaggle/input/surrey-audiovisual-expressed-emotion-savee/ALL/"
CREMA = "/kaggle/input/cremad/AudioWAV/"

# Run one example 
dir_list = os.listdir(SAVEE)
dir_list[0:5]
Out[2]:
['DC_su10.wav', 'DC_d03.wav', 'JK_sa02.wav', 'KL_sa14.wav', 'DC_sa15.wav']

Surrey Audio-Visual Expressed Emotion (SAVEE)

In [3]:
# Get the data location for SAVEE
dir_list = os.listdir(SAVEE)

# SAVEE file names carry a 1-2 character emotion code just before the
# two-digit take number (e.g. 'DC_sa01.wav' -> 'sa'); all SAVEE speakers
# are male, so every label gets a 'male_' prefix.
code_to_label = {
    '_a': 'male_angry',
    '_d': 'male_disgust',
    '_f': 'male_fear',
    '_h': 'male_happy',
    '_n': 'male_neutral',
    'sa': 'male_sad',
    'su': 'male_surprise',
}

emotion = []
path = []
for wav_name in dir_list:
    # characters [-8:-6] hold the emotion code; anything unrecognised
    # is tagged 'male_error' so bad files are easy to spot later
    emotion.append(code_to_label.get(wav_name[-8:-6], 'male_error'))
    path.append(SAVEE + wav_name)

# Now check out the label count distribution 
SAVEE_df = pd.DataFrame(emotion, columns=['labels'])
SAVEE_df['source'] = 'SAVEE'
SAVEE_df = pd.concat([SAVEE_df, pd.DataFrame(path, columns=['path'])], axis=1)
SAVEE_df.labels.value_counts()
Out[3]:
male_neutral     120
male_happy        60
male_fear         60
male_disgust      60
male_surprise     60
male_sad          60
male_angry        60
Name: labels, dtype: int64
In [4]:
# use the well known Librosa library for this task 
# Listen to / visualise one fearful SAVEE clip (speaker DC, take 11)
fname = SAVEE + 'DC_f11.wav'  
# librosa.load resamples to its default 22050 Hz
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
# NOTE(review): waveplot was renamed waveshow in librosa 0.10 — this call
# only works on the older librosa this notebook was pinned to; confirm version.
librosa.display.waveplot(data, sr=sampling_rate)

# Lets play the audio 
ipd.Audio(fname)
Out[4]:
In [5]:
# Lets play a happy track
# Same speaker/take as the fearful clip above, for a like-for-like comparison
fname = SAVEE + 'DC_h11.wav'  
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
# NOTE(review): waveplot is the pre-0.10 librosa API (now waveshow)
librosa.display.waveplot(data, sr=sampling_rate)

# Lets play the audio 
ipd.Audio(fname)
Out[5]:

Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS)

In [6]:
# Walk the per-actor RAVDESS folders and decode each file name.
dir_list = os.listdir(RAV)
dir_list.sort()

emotion = []
gender = []
path = []
for actor_dir in dir_list:
    for wav_name in os.listdir(RAV + actor_dir):
        # RAVDESS name fields (dash-separated):
        # modality-vocal-emotion-intensity-statement-repetition-actor
        fields = wav_name.split('.')[0].split('-')
        emotion.append(int(fields[2]))
        # even actor IDs are female, odd are male
        gender.append("female" if int(fields[6]) % 2 == 0 else "male")
        path.append(RAV + actor_dir + '/' + wav_name)

# Map the numeric emotion codes to names and build gender_emotion labels
RAV_df = pd.DataFrame(emotion)
RAV_df = RAV_df.replace({1:'neutral', 2:'neutral', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 8:'surprise'})
RAV_df = pd.concat([pd.DataFrame(gender), RAV_df], axis=1)
RAV_df.columns = ['gender', 'emotion']
RAV_df['labels'] = RAV_df.gender + '_' + RAV_df.emotion
RAV_df['source'] = 'RAVDESS'  
RAV_df = pd.concat([RAV_df, pd.DataFrame(path, columns=['path'])], axis=1)
RAV_df = RAV_df.drop(['gender', 'emotion'], axis=1)
RAV_df.labels.value_counts()
Out[6]:
male_neutral       144
female_neutral     144
male_happy          96
male_fear           96
male_surprise       96
male_angry          96
male_sad            96
female_sad          96
female_angry        96
female_surprise     96
female_disgust      96
female_happy        96
male_disgust        96
female_fear         96
Name: labels, dtype: int64
In [7]:
# Pick a fearful track
# RAVDESS code 06 in the third field = fear; actor 14
fname = RAV + 'Actor_14/03-01-06-02-02-02-14.wav'  
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
# NOTE(review): waveplot is the pre-0.10 librosa API (now waveshow)
librosa.display.waveplot(data, sr=sampling_rate)

# Lets play the audio 
ipd.Audio(fname)
Out[7]:
In [8]:
# Pick a happy track
# Same actor/statement as above but emotion code 03 = happy
fname = RAV + 'Actor_14/03-01-03-02-02-02-14.wav'  
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
# NOTE(review): waveplot is the pre-0.10 librosa API (now waveshow)
librosa.display.waveplot(data, sr=sampling_rate)

# Lets play the audio 
ipd.Audio(fname)
Out[8]:

Toronto Emotional Speech Set (TESS)

In [9]:
# List the TESS sub-folders (one per speaker-age/emotion pair),
# sorted for a stable, reproducible order.
dir_list = sorted(os.listdir(TESS))
dir_list
Out[9]:
['OAF_Fear',
 'OAF_Pleasant_surprise',
 'OAF_Sad',
 'OAF_angry',
 'OAF_disgust',
 'OAF_happy',
 'OAF_neutral',
 'YAF_angry',
 'YAF_disgust',
 'YAF_fear',
 'YAF_happy',
 'YAF_neutral',
 'YAF_pleasant_surprised',
 'YAF_sad']
In [10]:
path = []
emotion = []

# Map each TESS folder to its label. OAF = older adult female,
# YAF = younger adult female — every TESS clip is female. Note the
# dataset's inconsistent capitalisation of some folder names.
folder_to_label = {
    'OAF_angry': 'female_angry',
    'YAF_angry': 'female_angry',
    'OAF_disgust': 'female_disgust',
    'YAF_disgust': 'female_disgust',
    'OAF_Fear': 'female_fear',
    'YAF_fear': 'female_fear',
    'OAF_happy': 'female_happy',
    'YAF_happy': 'female_happy',
    'OAF_neutral': 'female_neutral',
    'YAF_neutral': 'female_neutral',
    'OAF_Pleasant_surprise': 'female_surprise',
    'YAF_pleasant_surprised': 'female_surprise',
    'OAF_Sad': 'female_sad',
    'YAF_sad': 'female_sad',
}

for folder in dir_list:
    for wav_name in os.listdir(TESS + folder):
        # unrecognised folders fall through to 'Unknown'
        emotion.append(folder_to_label.get(folder, 'Unknown'))
        path.append(TESS + folder + "/" + wav_name)

TESS_df = pd.DataFrame(emotion, columns=['labels'])
TESS_df['source'] = 'TESS'
TESS_df = pd.concat([TESS_df, pd.DataFrame(path, columns=['path'])], axis=1)
TESS_df.labels.value_counts()
Out[10]:
female_sad         400
female_disgust     400
female_fear        400
female_angry       400
female_neutral     400
female_surprise    400
female_happy       400
Name: labels, dtype: int64
In [11]:
# lets play a fearful track 
# TESS file names encode the carrier word and the emotion ('dog', fear)
fname = TESS + 'YAF_fear/YAF_dog_fear.wav' 

data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
# NOTE(review): waveplot is the pre-0.10 librosa API (now waveshow)
librosa.display.waveplot(data, sr=sampling_rate)

# Lets play the audio 
ipd.Audio(fname)
Out[11]:
In [12]:
# lets play a happy track 
# Same carrier word ('dog') so only the emotion differs from the clip above
fname =  TESS + 'YAF_happy/YAF_dog_happy.wav' 

data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
# NOTE(review): waveplot is the pre-0.10 librosa API (now waveshow)
librosa.display.waveplot(data, sr=sampling_rate)

# Lets play the audio 
ipd.Audio(fname)
Out[12]:

Crowd Sourced Emotional Multimodal Actors Dataset (CREMA-D)

In [13]:
# CREMA-D is one flat folder of wav files; peek at the naming scheme
# (ActorID_Sentence_Emotion_Intensity.wav)
dir_list = os.listdir(CREMA)
dir_list.sort()
print(dir_list[0:10])
['1001_DFA_ANG_XX.wav', '1001_DFA_DIS_XX.wav', '1001_DFA_FEA_XX.wav', '1001_DFA_HAP_XX.wav', '1001_DFA_NEU_XX.wav', '1001_DFA_SAD_XX.wav', '1001_IEO_ANG_HI.wav', '1001_IEO_ANG_LO.wav', '1001_IEO_ANG_MD.wav', '1001_IEO_DIS_HI.wav']
In [14]:
gender = []
emotion = []
path = []
# Actor IDs of the female speakers in CREMA-D; every other ID is male.
female = [1002,1003,1004,1006,1007,1008,1009,1010,1012,1013,1018,1020,1021,1024,1025,1028,1029,1030,1037,1043,1046,1047,1049,
          1052,1053,1054,1055,1056,1058,1060,1061,1063,1072,1073,1074,1075,1076,1078,1079,1082,1084,1089,1091]

# CREMA-D file names: ActorID_Sentence_EmotionCode_Intensity.wav —
# the third field maps to an emotion name.
emotion_codes = {
    'SAD': 'sad',
    'ANG': 'angry',
    'DIS': 'disgust',
    'FEA': 'fear',
    'HAP': 'happy',
    'NEU': 'neutral',
}

for wav_name in dir_list:
    part = wav_name.split('_')
    temp = 'female' if int(part[0]) in female else 'male'
    gender.append(temp)
    if part[2] in emotion_codes:
        # label is gender-qualified, e.g. 'female_angry'
        emotion.append(temp + '_' + emotion_codes[part[2]])
    else:
        emotion.append('Unknown')
    path.append(CREMA + wav_name)

CREMA_df = pd.DataFrame(emotion, columns=['labels'])
CREMA_df['source'] = 'CREMA'
CREMA_df = pd.concat([CREMA_df, pd.DataFrame(path, columns=['path'])], axis=1)
CREMA_df.labels.value_counts()
Out[14]:
male_happy        671
male_fear         671
male_angry        671
male_sad          671
male_disgust      671
female_sad        600
female_disgust    600
female_fear       600
female_angry      600
female_happy      600
male_neutral      575
female_neutral    512
Name: labels, dtype: int64
In [15]:
# use the well known Librosa library for this task 
# A happy CREMA-D clip (actor 1012, IEO sentence, high intensity)
fname = CREMA + '1012_IEO_HAP_HI.wav'  
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
# NOTE(review): waveplot is the pre-0.10 librosa API (now waveshow)
librosa.display.waveplot(data, sr=sampling_rate)

# Lets play the audio 
ipd.Audio(fname)
Out[15]:
In [16]:
# A fearful track
# Same actor/sentence/intensity as above, only the emotion differs
fname = CREMA + '1012_IEO_FEA_HI.wav'  
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
# NOTE(review): waveplot is the pre-0.10 librosa API (now waveshow)
librosa.display.waveplot(data, sr=sampling_rate)

# Lets play the audio 
ipd.Audio(fname)
Out[16]:

Combining the 4 Datasets

In [17]:
# Stack the four per-dataset label/source/path frames into one meta-data table
df = pd.concat([SAVEE_df, RAV_df, TESS_df, CREMA_df], axis = 0)
print(df.labels.value_counts())
# NOTE(review): head() is not the cell's last expression, so its result is
# never displayed — harmless dead expression.
df.head()
# Persist the combined meta-data so the feature-extraction part can reload it
df.to_csv("Data_path.csv",index=False)
female_sad         1096
female_fear        1096
female_happy       1096
female_disgust     1096
female_angry       1096
female_neutral     1056
male_neutral        839
male_fear           827
male_disgust        827
male_happy          827
male_angry          827
male_sad            827
female_surprise     496
male_surprise       156
Name: labels, dtype: int64

Feature Extraction

In [18]:
# Import our libraries
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
import pandas as pd
import os
import IPython.display as ipd  # To play sound in the notebook
In [19]:
# Source - RAVDESS; Gender - Female; Emotion - Angry 
path = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_08/03-01-05-02-01-01-08.wav"
# 2.5 s window starting 0.5 s in, resampled to 44.1 kHz (22050*2)
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)  
# 13 MFCC coefficients per frame -> array of shape (13, n_frames)
mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)

# audio wave
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
# NOTE(review): waveplot is the pre-0.10 librosa API (now waveshow)
librosa.display.waveplot(X, sr=sample_rate)
plt.title('Audio sampled at 44100 hrz')

# MFCC
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
librosa.display.specshow(mfcc, x_axis='time')
plt.ylabel('MFCC')
plt.colorbar()

ipd.Audio(path)
Out[19]:
In [20]:
# Source - RAVDESS; Gender - Male; Emotion - Angry 
path = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_09/03-01-05-01-01-01-09.wav"
# Same load settings as the female clip above for a fair comparison
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)  
mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)

# audio wave
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
# NOTE(review): waveplot is the pre-0.10 librosa API (now waveshow)
librosa.display.waveplot(X, sr=sample_rate)
plt.title('Audio sampled at 44100 hrz')

# MFCC
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
librosa.display.specshow(mfcc, x_axis='time')
plt.ylabel('MFCC')
plt.colorbar()

ipd.Audio(path)
Out[20]:
In [21]:
# Source - RAVDESS; Gender - Female; Emotion - Happy 
path = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_12/03-01-03-01-02-01-12.wav"
# Same load settings as the angry clips above
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)  
mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)

# audio wave
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
# NOTE(review): waveplot is the pre-0.10 librosa API (now waveshow)
librosa.display.waveplot(X, sr=sample_rate)
plt.title('Audio sampled at 44100 hrz')

# MFCC
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
librosa.display.specshow(mfcc, x_axis='time')
plt.ylabel('MFCC')
plt.colorbar()

ipd.Audio(path)
Out[21]:
In [22]:
# Source - RAVDESS; Gender - Male; Emotion - Happy 
path = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_11/03-01-03-01-02-02-11.wav"
# Same load settings as the previous comparison cells
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)  
mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)

# audio wave
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
# NOTE(review): waveplot is the pre-0.10 librosa API (now waveshow)
librosa.display.waveplot(X, sr=sample_rate)
plt.title('Audio sampled at 44100 hrz')

# MFCC
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
librosa.display.specshow(mfcc, x_axis='time')
plt.ylabel('MFCC')
plt.colorbar()

ipd.Audio(path)
Out[22]:
In [23]:
# Compare the frame-wise mean MFCC of a female vs a male angry clip.
# Source - RAVDESS; Gender - Female; Emotion - Angry 
path = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_08/03-01-05-02-01-01-08.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)  
# axis=0 averages over the 13 coefficients, giving one value per time frame.
# (Fixed: the original computed the full MFCC matrix twice, immediately
# discarding the first result.)
female = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(female))

# Source - RAVDESS; Gender - Male; Emotion - Angry 
path = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_09/03-01-05-01-01-01-09.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)  
male = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(male))

# Plot the two mean-MFCC curves together (one point per time frame)
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
plt.plot(female, label='female')
plt.plot(male, label='male')
plt.legend()
216
216
Out[23]:
<matplotlib.legend.Legend at 0x7ff9c4a0e630>
In [24]:
# Compare the frame-wise mean MFCC of a female vs a male happy clip.
# Source - RAVDESS; Gender - Female; Emotion - happy 
path = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_12/03-01-03-01-02-01-12.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)  
# axis=0 averages over the 13 coefficients, giving one value per time frame.
# (Fixed: the original computed the full MFCC matrix twice, immediately
# discarding the first result.)
female = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(female))

# Source - RAVDESS; Gender - Male; Emotion - happy 
path = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_11/03-01-03-01-02-02-11.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast',duration=2.5,sr=22050*2,offset=0.5)  
male = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(male))

# Plot the two mean-MFCC curves together (one point per time frame)
plt.figure(figsize=(20, 15))
plt.subplot(3,1,1)
plt.plot(female, label='female')
plt.plot(male, label='male')
plt.legend()
216
216
Out[24]:
<matplotlib.legend.Legend at 0x7ff9c49edf98>

Data Preparation & Processing

In [25]:
# Importing required libraries 
# Keras
import keras
from keras import regularizers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint

# sklearn
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Other  
import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import seaborn as sns
import glob 
import os
import pickle
import IPython.display as ipd  # To play sound in the notebook
Using TensorFlow backend.
In [26]:
# lets pick up the meta-data that we got from our first part of the Kernel
# (the Data_path.csv written by the data-exploration section, re-uploaded
# as its own Kaggle dataset)
ref = pd.read_csv("/kaggle/input/datapath/Data_path.csv")
ref.head()
Out[26]:
labels source path
0 male_fear SAVEE /kaggle/input/surrey-audiovisual-expressed-emo...
1 male_neutral SAVEE /kaggle/input/surrey-audiovisual-expressed-emo...
2 male_disgust SAVEE /kaggle/input/surrey-audiovisual-expressed-emo...
3 male_neutral SAVEE /kaggle/input/surrey-audiovisual-expressed-emo...
4 male_neutral SAVEE /kaggle/input/surrey-audiovisual-expressed-emo...
In [27]:
# Note this takes a couple of minutes (~10 mins) as we're iterating over 4 datasets 
# For each clip, load a 2.5 s window (0.5 s offset, 44.1 kHz) and take the
# frame-wise mean of 13 MFCCs as the feature vector.
# Collecting rows in a list and building the DataFrame once replaces the
# original row-by-row df.loc growth (quadratic) and drops the unused
# enumerate index and the pointless np.array(sample_rate) conversion.
features = []
for path in ref.path:
    X, sample_rate = librosa.load(path
                                  , res_type='kaiser_fast'
                                  ,duration=2.5
                                  ,sr=44100
                                  ,offset=0.5
                                 )
    # mean as the feature. Could do min and max etc as well. 
    mfccs = np.mean(librosa.feature.mfcc(y=X,
                                        sr=sample_rate,
                                        n_mfcc=13),
                    axis=0)
    features.append([mfccs])

df = pd.DataFrame(features, columns=['feature'])

# Check a few records to make sure its processed successfully
print(len(df))
df.head()
12162
Out[27]:
feature
0 [-13.442276, -8.0205965, -5.986821, -6.373723,...
1 [-20.744328, -14.986705, -10.742075, -9.125931...
2 [-24.295418, -25.178337, -28.674786, -30.55396...
3 [-15.210331, -12.718095, -10.783083, -9.869562...
4 [-27.552608, -24.492954, -24.695808, -26.65125...
In [28]:
# Now extract the mean bands to its own feature columns
# Explode the list-valued 'feature' column into one numeric column per
# time frame, alongside the labels/source/path meta-data from `ref`.
# Clips shorter than 2.5 s produce fewer frames, so trailing columns are NaN.
df = pd.concat([ref,pd.DataFrame(df['feature'].values.tolist())],axis=1)
df[:5]
Out[28]:
labels source path 0 1 2 3 4 5 6 ... 206 207 208 209 210 211 212 213 214 215
0 male_fear SAVEE /kaggle/input/surrey-audiovisual-expressed-emo... -13.442276 -8.020597 -5.986821 -6.373723 -5.190185 -4.239274 -3.880404 ... -11.022025 -10.693925 -10.625393 -10.843586 -10.529748 -9.981356 -10.076421 -9.949388 -4.860995 -0.177099
1 male_neutral SAVEE /kaggle/input/surrey-audiovisual-expressed-emo... -20.744328 -14.986705 -10.742075 -9.125931 -9.668313 -10.378889 -11.779993 ... -7.531629 -7.365328 -7.629715 -8.144470 -8.882616 -9.492288 -10.136461 -10.665747 -10.236475 -8.631506
2 male_disgust SAVEE /kaggle/input/surrey-audiovisual-expressed-emo... -24.295418 -25.178337 -28.674786 -30.553968 -30.306000 -30.717464 -31.367340 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 male_neutral SAVEE /kaggle/input/surrey-audiovisual-expressed-emo... -15.210331 -12.718095 -10.783083 -9.869562 -10.170098 -10.380077 -11.160923 ... -9.734767 -9.951584 -9.711096 -9.834348 -9.618107 -8.442469 -8.556032 -8.778090 -6.676293 -4.096513
4 male_neutral SAVEE /kaggle/input/surrey-audiovisual-expressed-emo... -27.552608 -24.492954 -24.695808 -26.651257 -25.564135 -24.906410 -23.095161 ... -10.612562 -7.081086 -6.986003 -6.659358 -7.839975 -8.755316 -7.988830 -8.845961 -5.263283 -1.825724

5 rows × 219 columns

In [29]:
# replace NA with 0
# Zero-pad the trailing NaN columns left by clips shorter than 2.5 s,
# so every row has the full feature width.
df=df.fillna(0)
print(df.shape)
df[:5]
(12162, 219)
Out[29]:
labels source path 0 1 2 3 4 5 6 ... 206 207 208 209 210 211 212 213 214 215
0 male_fear SAVEE /kaggle/input/surrey-audiovisual-expressed-emo... -13.442276 -8.020597 -5.986821 -6.373723 -5.190185 -4.239274 -3.880404 ... -11.022025 -10.693925 -10.625393 -10.843586 -10.529748 -9.981356 -10.076421 -9.949388 -4.860995 -0.177099
1 male_neutral SAVEE /kaggle/input/surrey-audiovisual-expressed-emo... -20.744328 -14.986705 -10.742075 -9.125931 -9.668313 -10.378889 -11.779993 ... -7.531629 -7.365328 -7.629715 -8.144470 -8.882616 -9.492288 -10.136461 -10.665747 -10.236475 -8.631506
2 male_disgust SAVEE /kaggle/input/surrey-audiovisual-expressed-emo... -24.295418 -25.178337 -28.674786 -30.553968 -30.306000 -30.717464 -31.367340 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
3 male_neutral SAVEE /kaggle/input/surrey-audiovisual-expressed-emo... -15.210331 -12.718095 -10.783083 -9.869562 -10.170098 -10.380077 -11.160923 ... -9.734767 -9.951584 -9.711096 -9.834348 -9.618107 -8.442469 -8.556032 -8.778090 -6.676293 -4.096513
4 male_neutral SAVEE /kaggle/input/surrey-audiovisual-expressed-emo... -27.552608 -24.492954 -24.695808 -26.651257 -25.564135 -24.906410 -23.095161 ... -10.612562 -7.081086 -6.986003 -6.659358 -7.839975 -8.755316 -7.988830 -8.845961 -5.263283 -1.825724

5 rows × 219 columns

Splitting the Dataset into Training and Test

In [30]:
# Split between train and test 
# 75/25 split with a fixed seed for reproducibility; features are every
# column except the meta-data, target is the gender_emotion label.
# NOTE(review): no stratify= is used, so class proportions may drift
# slightly between splits — consider stratify=df.labels.
X_train, X_test, y_train, y_test = train_test_split(df.drop(['path','labels','source'],axis=1)
                                                    , df.labels
                                                    , test_size=0.25
                                                    , shuffle=True
                                                    , random_state=42
                                                   )

# Lets see how the data present itself before normalisation 
X_train[150:160]
Out[30]:
0 1 2 3 4 5 6 7 8 9 ... 206 207 208 209 210 211 212 213 214 215
4950 -18.611179 -17.616539 -18.411484 -18.987419 -17.404621 -16.747272 -17.733747 -18.055025 -17.931210 -15.913172 ... -22.899403 -21.647816 -19.758656 -18.879402 -19.397377 -20.171659 -22.689243 -24.612814 -24.153776 -22.703135
3860 -18.993063 -21.756165 -29.215248 -27.261314 -21.186777 -21.748455 -24.759428 -28.835772 -30.493805 -28.646847 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
9761 -1.533947 -4.030602 -9.614023 -12.045173 -9.992992 -11.926250 -14.008465 -13.561555 -14.024568 -15.151947 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
7620 -4.531077 -3.933792 -4.567834 -5.871509 -5.282475 -6.490459 -8.156466 -9.188803 -8.681725 -8.212409 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
11586 -20.621702 -21.587507 -20.563646 -20.703459 -21.205715 -18.608534 -18.446669 -16.211845 -14.257651 -15.160404 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
7914 -17.514988 -18.551867 -17.043016 -16.977903 -19.369633 -19.562126 -22.008749 -20.178385 -17.989597 -19.336285 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
9513 -18.740368 -18.824930 -16.149488 -16.963457 -18.229979 -18.183952 -19.274342 -18.395123 -16.951286 -16.672031 ... -17.882130 -19.390713 -17.779472 -19.165974 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
5835 -19.066849 -18.328381 -17.710285 -18.043192 -18.252480 -18.710625 -16.626352 -17.831005 -18.028343 -17.859104 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
5389 -20.760590 -20.047138 -18.961346 -19.468687 -19.316292 -18.162563 -18.102333 -19.914133 -20.931385 -19.215496 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
11222 -18.252924 -17.727373 -19.222475 -18.469971 -17.572325 -17.850542 -17.932026 -20.588900 -18.612183 -15.990726 ... -18.065437 -18.135090 -19.665306 -20.741905 -20.273037 -18.371035 -15.576723 -17.512489 -17.008547 -18.195284

10 rows × 216 columns

In [31]:
# Lets do data normalization: z-score each feature column using statistics
# computed on the training split only, then apply them to the test split
# (avoids train/test leakage).
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

X_train = (X_train - mean)/std
X_test = (X_test - mean)/std

# Check the dataset now 
X_train[150:160]
Out[31]:
0 1 2 3 4 5 6 7 8 9 ... 206 207 208 209 210 211 212 213 214 215
4950 0.188950 0.305118 0.437436 0.388285 0.498412 0.542670 0.462011 0.432879 0.435500 0.580216 ... -0.872288 -0.805973 -0.686809 -0.625868 -0.671261 -0.714047 -0.856760 -0.979458 -0.962763 -0.871126
3860 0.162312 0.005071 -0.362560 -0.222813 0.219588 0.174005 -0.054661 -0.359106 -0.486929 -0.354440 ... 0.540713 0.527132 0.526471 0.527229 0.512277 0.512199 0.512920 0.501491 0.496899 0.492661
9761 1.380186 1.289851 1.088870 0.901029 1.044805 0.898055 0.735970 0.762982 0.722352 0.636090 ... 0.540713 0.527132 0.526471 0.527229 0.512277 0.512199 0.512920 0.501491 0.496899 0.492661
7620 1.171119 1.296868 1.462530 1.357007 1.392069 1.298758 1.166329 1.084217 1.114659 1.145455 ... 0.540713 0.527132 0.526471 0.527229 0.512277 0.512199 0.512920 0.501491 0.496899 0.492661
11586 0.048705 0.017295 0.278073 0.261541 0.218192 0.405466 0.409582 0.568284 0.705237 0.635469 ... 0.540713 0.527132 0.526471 0.527229 0.512277 0.512199 0.512920 0.501491 0.496899 0.492661
7914 0.265416 0.237324 0.538768 0.536705 0.353550 0.335172 0.147625 0.276890 0.431213 0.328958 ... 0.540713 0.527132 0.526471 0.527229 0.512277 0.512199 0.512920 0.501491 0.496899 0.492661
9513 0.179939 0.217532 0.604932 0.537772 0.437566 0.436765 0.348715 0.407894 0.507452 0.524515 ... -0.562698 -0.666977 -0.565277 -0.643370 0.512277 0.512199 0.512920 0.501491 0.496899 0.492661
5835 0.157165 0.253523 0.489358 0.458024 0.435907 0.397941 0.543449 0.449336 0.428368 0.437384 ... 0.540713 0.527132 0.526471 0.527229 0.512277 0.512199 0.512920 0.501491 0.496899 0.492661
5389 0.039016 0.128944 0.396720 0.352739 0.357482 0.438341 0.434905 0.296303 0.215207 0.337824 ... 0.540713 0.527132 0.526471 0.527229 0.512277 0.512199 0.512920 0.501491 0.496899 0.492661
11222 0.213941 0.297085 0.377384 0.426503 0.486049 0.461342 0.447429 0.246733 0.385498 0.574523 ... -0.574009 -0.589654 -0.681077 -0.739624 -0.724690 -0.604586 -0.427399 -0.552233 -0.530962 -0.600338

10 rows × 216 columns

In [32]:
# Lets few preparation steps to get it into the correct format for Keras 
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# one hot encode the target 
lb = LabelEncoder()
y_train = np_utils.to_categorical(lb.fit_transform(y_train))
# BUGFIX: use transform (not fit_transform) on the test labels so the test
# encoding reuses the class mapping fitted on the training labels; refitting
# could silently remap classes if the test split were missing a label.
y_test = np_utils.to_categorical(lb.transform(y_test))

print(X_train.shape)
print(lb.classes_)
#print(y_train[0:10])
#print(y_test[0:10])

# Pickle the fitted LabelEncoder so predictions can be decoded later;
# the context manager guarantees the file is closed even on error.
with open('labels', 'wb') as outfile:
    pickle.dump(lb, outfile)
(9121, 216)
['female_angry' 'female_disgust' 'female_fear' 'female_happy'
 'female_neutral' 'female_sad' 'female_surprise' 'male_angry'
 'male_disgust' 'male_fear' 'male_happy' 'male_neutral' 'male_sad'
 'male_surprise']
In [33]:
# Add a trailing channel axis: Conv1D expects input shaped (samples, timesteps, channels).
X_train = X_train[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]
X_train.shape
Out[33]:
(9121, 216, 1)

Modelling

In [34]:
# New model: the same 1-D CNN as before, assembled from an explicit layer
# list so the whole architecture can be read (and edited) in one place.
layer_stack = [
    # Block 1: two wide conv layers, then pool 216 -> 27
    Conv1D(256, 8, padding='same', input_shape=(X_train.shape[1], 1)),  # X_train.shape[1] = no. of feature columns
    Activation('relu'),
    Conv1D(256, 8, padding='same'),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.25),
    MaxPooling1D(pool_size=8),
    # Block 2: four 128-filter conv layers, then pool 27 -> 3
    Conv1D(128, 8, padding='same'),
    Activation('relu'),
    Conv1D(128, 8, padding='same'),
    Activation('relu'),
    Conv1D(128, 8, padding='same'),
    Activation('relu'),
    Conv1D(128, 8, padding='same'),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.25),
    MaxPooling1D(pool_size=8),
    # Block 3: two narrow conv layers feeding the classifier head
    Conv1D(64, 8, padding='same'),
    Activation('relu'),
    Conv1D(64, 8, padding='same'),
    Activation('relu'),
    Flatten(),
    Dense(14),  # target class count: 7 emotions x 2 genders (see lb.classes_)
    Activation('softmax'),
]

model = Sequential()
for layer in layer_stack:
    model.add(layer)

# Very low learning rate; SGD/Adam were tried earlier and abandoned.
opt = keras.optimizers.rmsprop(lr=0.00001, decay=1e-6)
model.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv1d_1 (Conv1D)            (None, 216, 256)          2304      
_________________________________________________________________
activation_1 (Activation)    (None, 216, 256)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 216, 256)          524544    
_________________________________________________________________
batch_normalization_1 (Batch (None, 216, 256)          1024      
_________________________________________________________________
activation_2 (Activation)    (None, 216, 256)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 216, 256)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 27, 256)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 27, 128)           262272    
_________________________________________________________________
activation_3 (Activation)    (None, 27, 128)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 27, 128)           131200    
_________________________________________________________________
activation_4 (Activation)    (None, 27, 128)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 27, 128)           131200    
_________________________________________________________________
activation_5 (Activation)    (None, 27, 128)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 27, 128)           131200    
_________________________________________________________________
batch_normalization_2 (Batch (None, 27, 128)           512       
_________________________________________________________________
activation_6 (Activation)    (None, 27, 128)           0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 27, 128)           0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 3, 128)            0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 3, 64)             65600     
_________________________________________________________________
activation_7 (Activation)    (None, 3, 64)             0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 3, 64)             32832     
_________________________________________________________________
activation_8 (Activation)    (None, 3, 64)             0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 192)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 14)                2702      
_________________________________________________________________
activation_9 (Activation)    (None, 14)                0         
=================================================================
Total params: 1,285,390
Trainable params: 1,284,622
Non-trainable params: 768
_________________________________________________________________

Compiling and Training the CNN Model

In [35]:
# Compile with categorical cross-entropy (targets are one-hot encoded) and
# train; keep the History object so the loss curves can be plotted afterwards.
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model_history = model.fit(x=X_train, y=y_train, batch_size=16, epochs=100,
                          validation_data=(X_test, y_test))
Train on 9121 samples, validate on 3041 samples
Epoch 1/100
9121/9121 [==============================] - 12s 1ms/step - loss: 2.3832 - accuracy: 0.1960 - val_loss: 2.4170 - val_accuracy: 0.2230
Epoch 2/100
9121/9121 [==============================] - 7s 796us/step - loss: 2.1656 - accuracy: 0.2677 - val_loss: 2.2233 - val_accuracy: 0.2782
Epoch 3/100
9121/9121 [==============================] - 7s 794us/step - loss: 2.0703 - accuracy: 0.2942 - val_loss: 2.1543 - val_accuracy: 0.2983
Epoch 4/100
9121/9121 [==============================] - 7s 801us/step - loss: 2.0016 - accuracy: 0.3148 - val_loss: 2.0984 - val_accuracy: 0.3229
Epoch 5/100
9121/9121 [==============================] - 7s 789us/step - loss: 1.9467 - accuracy: 0.3327 - val_loss: 2.0452 - val_accuracy: 0.3384
Epoch 6/100
9121/9121 [==============================] - 7s 804us/step - loss: 1.9025 - accuracy: 0.3427 - val_loss: 2.0126 - val_accuracy: 0.3427
Epoch 7/100
9121/9121 [==============================] - 7s 813us/step - loss: 1.8608 - accuracy: 0.3596 - val_loss: 1.9799 - val_accuracy: 0.3519
Epoch 8/100
9121/9121 [==============================] - 7s 800us/step - loss: 1.8253 - accuracy: 0.3713 - val_loss: 1.9591 - val_accuracy: 0.3528
Epoch 9/100
9121/9121 [==============================] - 7s 797us/step - loss: 1.7916 - accuracy: 0.3860 - val_loss: 1.9234 - val_accuracy: 0.3604
Epoch 10/100
9121/9121 [==============================] - 7s 789us/step - loss: 1.7597 - accuracy: 0.3933 - val_loss: 1.8995 - val_accuracy: 0.3841
Epoch 11/100
9121/9121 [==============================] - 7s 789us/step - loss: 1.7368 - accuracy: 0.4071 - val_loss: 1.8819 - val_accuracy: 0.3900
Epoch 12/100
9121/9121 [==============================] - 7s 800us/step - loss: 1.7144 - accuracy: 0.4142 - val_loss: 1.8674 - val_accuracy: 0.3864
Epoch 13/100
9121/9121 [==============================] - 7s 790us/step - loss: 1.6922 - accuracy: 0.4207 - val_loss: 1.8390 - val_accuracy: 0.3923
Epoch 14/100
9121/9121 [==============================] - 7s 796us/step - loss: 1.6772 - accuracy: 0.4210 - val_loss: 1.8433 - val_accuracy: 0.3828
Epoch 15/100
9121/9121 [==============================] - 7s 794us/step - loss: 1.6576 - accuracy: 0.4274 - val_loss: 1.8139 - val_accuracy: 0.3949
Epoch 16/100
9121/9121 [==============================] - 7s 794us/step - loss: 1.6416 - accuracy: 0.4318 - val_loss: 1.7975 - val_accuracy: 0.4041
Epoch 17/100
9121/9121 [==============================] - 7s 788us/step - loss: 1.6249 - accuracy: 0.4402 - val_loss: 1.8204 - val_accuracy: 0.3943
Epoch 18/100
9121/9121 [==============================] - 7s 799us/step - loss: 1.6011 - accuracy: 0.4485 - val_loss: 1.7775 - val_accuracy: 0.4104
Epoch 19/100
9121/9121 [==============================] - 7s 799us/step - loss: 1.5922 - accuracy: 0.4466 - val_loss: 1.7689 - val_accuracy: 0.3999
Epoch 20/100
9121/9121 [==============================] - 7s 789us/step - loss: 1.5823 - accuracy: 0.4492 - val_loss: 1.7654 - val_accuracy: 0.4104
Epoch 21/100
9121/9121 [==============================] - 7s 781us/step - loss: 1.5658 - accuracy: 0.4564 - val_loss: 1.7705 - val_accuracy: 0.4005
Epoch 22/100
9121/9121 [==============================] - 7s 782us/step - loss: 1.5579 - accuracy: 0.4616 - val_loss: 1.7510 - val_accuracy: 0.4094
Epoch 23/100
9121/9121 [==============================] - 7s 789us/step - loss: 1.5491 - accuracy: 0.4621 - val_loss: 1.7160 - val_accuracy: 0.4311
Epoch 24/100
9121/9121 [==============================] - 7s 792us/step - loss: 1.5284 - accuracy: 0.4703 - val_loss: 1.7441 - val_accuracy: 0.4097
Epoch 25/100
9121/9121 [==============================] - 7s 785us/step - loss: 1.5230 - accuracy: 0.4722 - val_loss: 1.7395 - val_accuracy: 0.4117
Epoch 26/100
9121/9121 [==============================] - 7s 792us/step - loss: 1.5049 - accuracy: 0.4785 - val_loss: 1.6850 - val_accuracy: 0.4278
Epoch 27/100
9121/9121 [==============================] - 7s 788us/step - loss: 1.4930 - accuracy: 0.4867 - val_loss: 1.6755 - val_accuracy: 0.4331
Epoch 28/100
9121/9121 [==============================] - 7s 802us/step - loss: 1.4869 - accuracy: 0.4812 - val_loss: 1.7088 - val_accuracy: 0.4183
Epoch 29/100
9121/9121 [==============================] - 7s 789us/step - loss: 1.4816 - accuracy: 0.4876 - val_loss: 1.6710 - val_accuracy: 0.4364
Epoch 30/100
9121/9121 [==============================] - 7s 791us/step - loss: 1.4670 - accuracy: 0.4905 - val_loss: 1.6790 - val_accuracy: 0.4380
Epoch 31/100
9121/9121 [==============================] - 7s 781us/step - loss: 1.4618 - accuracy: 0.4944 - val_loss: 1.6756 - val_accuracy: 0.4377
Epoch 32/100
9121/9121 [==============================] - 7s 779us/step - loss: 1.4467 - accuracy: 0.5028 - val_loss: 1.7009 - val_accuracy: 0.4124
Epoch 33/100
9121/9121 [==============================] - 7s 784us/step - loss: 1.4420 - accuracy: 0.5064 - val_loss: 1.6764 - val_accuracy: 0.4308
Epoch 34/100
9121/9121 [==============================] - 7s 794us/step - loss: 1.4332 - accuracy: 0.5038 - val_loss: 1.7214 - val_accuracy: 0.4084
Epoch 35/100
9121/9121 [==============================] - 7s 788us/step - loss: 1.4234 - accuracy: 0.5098 - val_loss: 1.6863 - val_accuracy: 0.4258
Epoch 36/100
9121/9121 [==============================] - 7s 786us/step - loss: 1.4146 - accuracy: 0.5162 - val_loss: 1.6735 - val_accuracy: 0.4301
Epoch 37/100
9121/9121 [==============================] - 8s 861us/step - loss: 1.4035 - accuracy: 0.5141 - val_loss: 1.7098 - val_accuracy: 0.4071
Epoch 38/100
9121/9121 [==============================] - 7s 787us/step - loss: 1.4048 - accuracy: 0.5116 - val_loss: 1.6544 - val_accuracy: 0.4377
Epoch 39/100
9121/9121 [==============================] - 7s 787us/step - loss: 1.3876 - accuracy: 0.5214 - val_loss: 1.6719 - val_accuracy: 0.4272
Epoch 40/100
9121/9121 [==============================] - 7s 789us/step - loss: 1.3783 - accuracy: 0.5248 - val_loss: 1.6589 - val_accuracy: 0.4298
Epoch 41/100
9121/9121 [==============================] - 7s 793us/step - loss: 1.3645 - accuracy: 0.5311 - val_loss: 1.6333 - val_accuracy: 0.4383
Epoch 42/100
9121/9121 [==============================] - 7s 787us/step - loss: 1.3625 - accuracy: 0.5339 - val_loss: 1.6401 - val_accuracy: 0.4344
Epoch 43/100
9121/9121 [==============================] - 7s 793us/step - loss: 1.3500 - accuracy: 0.5347 - val_loss: 1.6111 - val_accuracy: 0.4482
Epoch 44/100
9121/9121 [==============================] - 7s 780us/step - loss: 1.3418 - accuracy: 0.5350 - val_loss: 1.6543 - val_accuracy: 0.4242
Epoch 45/100
9121/9121 [==============================] - 7s 782us/step - loss: 1.3358 - accuracy: 0.5344 - val_loss: 1.6393 - val_accuracy: 0.4301
Epoch 46/100
9121/9121 [==============================] - 7s 785us/step - loss: 1.3297 - accuracy: 0.5451 - val_loss: 1.6266 - val_accuracy: 0.4351
Epoch 47/100
9121/9121 [==============================] - 7s 786us/step - loss: 1.3167 - accuracy: 0.5499 - val_loss: 1.6764 - val_accuracy: 0.4268
Epoch 48/100
9121/9121 [==============================] - 7s 787us/step - loss: 1.3120 - accuracy: 0.5508 - val_loss: 1.6852 - val_accuracy: 0.4249
Epoch 49/100
9121/9121 [==============================] - 7s 796us/step - loss: 1.3064 - accuracy: 0.5527 - val_loss: 1.6381 - val_accuracy: 0.4278
Epoch 50/100
9121/9121 [==============================] - 7s 808us/step - loss: 1.3015 - accuracy: 0.5521 - val_loss: 1.6658 - val_accuracy: 0.4229
Epoch 51/100
9121/9121 [==============================] - 7s 788us/step - loss: 1.2913 - accuracy: 0.5585 - val_loss: 1.6117 - val_accuracy: 0.4482
Epoch 52/100
9121/9121 [==============================] - 7s 795us/step - loss: 1.2837 - accuracy: 0.5586 - val_loss: 1.6154 - val_accuracy: 0.4410
Epoch 53/100
9121/9121 [==============================] - 7s 780us/step - loss: 1.2709 - accuracy: 0.5685 - val_loss: 1.6117 - val_accuracy: 0.4416
Epoch 54/100
9121/9121 [==============================] - 7s 777us/step - loss: 1.2664 - accuracy: 0.5672 - val_loss: 1.6338 - val_accuracy: 0.4331
Epoch 55/100
9121/9121 [==============================] - 7s 788us/step - loss: 1.2550 - accuracy: 0.5726 - val_loss: 1.6186 - val_accuracy: 0.4443
Epoch 56/100
9121/9121 [==============================] - 7s 792us/step - loss: 1.2478 - accuracy: 0.5793 - val_loss: 1.6223 - val_accuracy: 0.4318
Epoch 57/100
9121/9121 [==============================] - 7s 786us/step - loss: 1.2373 - accuracy: 0.5748 - val_loss: 1.5933 - val_accuracy: 0.4489
Epoch 58/100
9121/9121 [==============================] - 7s 782us/step - loss: 1.2254 - accuracy: 0.5834 - val_loss: 1.6193 - val_accuracy: 0.4351
Epoch 59/100
9121/9121 [==============================] - 7s 772us/step - loss: 1.2256 - accuracy: 0.5850 - val_loss: 1.6005 - val_accuracy: 0.4295
Epoch 60/100
9121/9121 [==============================] - 7s 781us/step - loss: 1.2118 - accuracy: 0.5812 - val_loss: 1.6235 - val_accuracy: 0.4403
Epoch 61/100
9121/9121 [==============================] - 7s 777us/step - loss: 1.2057 - accuracy: 0.5896 - val_loss: 1.6467 - val_accuracy: 0.4288
Epoch 62/100
9121/9121 [==============================] - 7s 775us/step - loss: 1.1935 - accuracy: 0.5976 - val_loss: 1.5900 - val_accuracy: 0.4512
Epoch 63/100
9121/9121 [==============================] - 7s 781us/step - loss: 1.1850 - accuracy: 0.5999 - val_loss: 1.5794 - val_accuracy: 0.4466
Epoch 64/100
9121/9121 [==============================] - 7s 781us/step - loss: 1.1817 - accuracy: 0.6006 - val_loss: 1.6366 - val_accuracy: 0.4268
Epoch 65/100
9121/9121 [==============================] - 7s 775us/step - loss: 1.1702 - accuracy: 0.6017 - val_loss: 1.6076 - val_accuracy: 0.4393
Epoch 66/100
9121/9121 [==============================] - 7s 774us/step - loss: 1.1689 - accuracy: 0.6017 - val_loss: 1.6414 - val_accuracy: 0.4278
Epoch 67/100
9121/9121 [==============================] - 7s 776us/step - loss: 1.1575 - accuracy: 0.6036 - val_loss: 1.5956 - val_accuracy: 0.4443
Epoch 68/100
9121/9121 [==============================] - 7s 776us/step - loss: 1.1481 - accuracy: 0.6150 - val_loss: 1.6118 - val_accuracy: 0.4377
Epoch 69/100
9121/9121 [==============================] - 7s 789us/step - loss: 1.1296 - accuracy: 0.6202 - val_loss: 1.6610 - val_accuracy: 0.4258
Epoch 70/100
9121/9121 [==============================] - 7s 782us/step - loss: 1.1279 - accuracy: 0.6164 - val_loss: 1.6441 - val_accuracy: 0.4239
Epoch 71/100
9121/9121 [==============================] - 7s 780us/step - loss: 1.1143 - accuracy: 0.6222 - val_loss: 1.6256 - val_accuracy: 0.4308
Epoch 72/100
9121/9121 [==============================] - 7s 780us/step - loss: 1.1086 - accuracy: 0.6295 - val_loss: 1.6203 - val_accuracy: 0.4314
Epoch 73/100
9121/9121 [==============================] - 7s 782us/step - loss: 1.1037 - accuracy: 0.6279 - val_loss: 1.6133 - val_accuracy: 0.4351
Epoch 74/100
9121/9121 [==============================] - 7s 785us/step - loss: 1.1016 - accuracy: 0.6316 - val_loss: 1.6175 - val_accuracy: 0.4367
Epoch 75/100
9121/9121 [==============================] - 7s 787us/step - loss: 1.0789 - accuracy: 0.6398 - val_loss: 1.6208 - val_accuracy: 0.4374
Epoch 76/100
9121/9121 [==============================] - 7s 776us/step - loss: 1.0817 - accuracy: 0.6356 - val_loss: 1.5897 - val_accuracy: 0.4443
Epoch 77/100
9121/9121 [==============================] - 7s 774us/step - loss: 1.0734 - accuracy: 0.6378 - val_loss: 1.6580 - val_accuracy: 0.4301
Epoch 78/100
9121/9121 [==============================] - 7s 775us/step - loss: 1.0633 - accuracy: 0.6428 - val_loss: 1.7318 - val_accuracy: 0.4124
Epoch 79/100
9121/9121 [==============================] - 8s 837us/step - loss: 1.0599 - accuracy: 0.6408 - val_loss: 1.6058 - val_accuracy: 0.4456
Epoch 80/100
9121/9121 [==============================] - 7s 792us/step - loss: 1.0520 - accuracy: 0.6459 - val_loss: 1.6158 - val_accuracy: 0.4449
Epoch 81/100
9121/9121 [==============================] - 7s 781us/step - loss: 1.0407 - accuracy: 0.6538 - val_loss: 1.6019 - val_accuracy: 0.4406
Epoch 82/100
9121/9121 [==============================] - 7s 779us/step - loss: 1.0229 - accuracy: 0.6633 - val_loss: 1.5816 - val_accuracy: 0.4525
Epoch 83/100
9121/9121 [==============================] - 7s 778us/step - loss: 1.0281 - accuracy: 0.6560 - val_loss: 1.5840 - val_accuracy: 0.4482
Epoch 84/100
9121/9121 [==============================] - 7s 771us/step - loss: 1.0104 - accuracy: 0.6648 - val_loss: 1.5854 - val_accuracy: 0.4410
Epoch 85/100
9121/9121 [==============================] - 7s 776us/step - loss: 1.0065 - accuracy: 0.6656 - val_loss: 1.6201 - val_accuracy: 0.4426
Epoch 86/100
9121/9121 [==============================] - 7s 781us/step - loss: 0.9917 - accuracy: 0.6711 - val_loss: 1.6107 - val_accuracy: 0.4413
Epoch 87/100
9121/9121 [==============================] - 7s 776us/step - loss: 0.9877 - accuracy: 0.6751 - val_loss: 1.6702 - val_accuracy: 0.4222
Epoch 88/100
9121/9121 [==============================] - 7s 772us/step - loss: 0.9769 - accuracy: 0.6739 - val_loss: 1.6293 - val_accuracy: 0.4344
Epoch 89/100
9121/9121 [==============================] - 7s 773us/step - loss: 0.9829 - accuracy: 0.6756 - val_loss: 1.6195 - val_accuracy: 0.4344
Epoch 90/100
9121/9121 [==============================] - 7s 776us/step - loss: 0.9573 - accuracy: 0.6873 - val_loss: 1.6102 - val_accuracy: 0.4383
Epoch 91/100
9121/9121 [==============================] - 7s 788us/step - loss: 0.9554 - accuracy: 0.6861 - val_loss: 1.6078 - val_accuracy: 0.4410
Epoch 92/100
9121/9121 [==============================] - 7s 775us/step - loss: 0.9591 - accuracy: 0.6790 - val_loss: 1.6139 - val_accuracy: 0.4367
Epoch 93/100
9121/9121 [==============================] - 7s 790us/step - loss: 0.9313 - accuracy: 0.6904 - val_loss: 1.6603 - val_accuracy: 0.4206
Epoch 94/100
9121/9121 [==============================] - 7s 785us/step - loss: 0.9244 - accuracy: 0.6901 - val_loss: 1.5905 - val_accuracy: 0.4423
Epoch 95/100
9121/9121 [==============================] - 7s 775us/step - loss: 0.9190 - accuracy: 0.6972 - val_loss: 1.6734 - val_accuracy: 0.4268
Epoch 96/100
9121/9121 [==============================] - 7s 779us/step - loss: 0.9199 - accuracy: 0.6963 - val_loss: 1.6554 - val_accuracy: 0.4344
Epoch 97/100
9121/9121 [==============================] - 7s 789us/step - loss: 0.9081 - accuracy: 0.6990 - val_loss: 1.6261 - val_accuracy: 0.4370
Epoch 98/100
9121/9121 [==============================] - 7s 777us/step - loss: 0.9035 - accuracy: 0.7012 - val_loss: 1.6060 - val_accuracy: 0.4400
Epoch 99/100
9121/9121 [==============================] - 7s 778us/step - loss: 0.8971 - accuracy: 0.7058 - val_loss: 1.5985 - val_accuracy: 0.4429
Epoch 100/100
9121/9121 [==============================] - 7s 780us/step - loss: 0.8831 - accuracy: 0.7063 - val_loss: 1.6085 - val_accuracy: 0.4462
In [36]:
# Training vs validation loss per epoch.
for series in ('loss', 'val_loss'):
    plt.plot(model_history.history[series])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
In [37]:
# Persist the trained model: full weights + architecture as HDF5, plus a
# standalone JSON copy of the architecture for model_from_json reloads.
save_dir = os.path.join(os.getcwd(), 'saved_models')
os.makedirs(save_dir, exist_ok=True)  # same effect as the isdir check + makedirs

model_name = 'Emotion_Model.h5'
model_path = os.path.join(save_dir, model_name)
model.save(model_path)
print('Save model and weights at %s ' % model_path)

# Save the architecture JSON to disk
model_json = model.to_json()
with open("model_json.json", "w") as json_file:
    json_file.write(model_json)
Save model and weights at /kaggle/working/saved_models/Emotion_Model.h5 
In [38]:
# Reload the model from disk (architecture from JSON, weights from HDF5)
# to confirm the saved artifacts reproduce the in-memory model's accuracy.
with open('model_json.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights("saved_models/Emotion_Model.h5")
print("Loaded model from disk")

# Keras optimiser (same settings as training; required before evaluate)
opt = keras.optimizers.rmsprop(lr=0.00001, decay=1e-6)
loaded_model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
score = loaded_model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))
Loaded model from disk
accuracy: 44.62%
In [39]:
# Class probabilities for the held-out set, then hard labels via argmax.
pred_probs = loaded_model.predict(X_test, batch_size=16, verbose=1)
preds = pred_probs.argmax(axis=1)
preds
3041/3041 [==============================] - 1s 237us/step
Out[39]:
array([11,  3,  8, ...,  7,  4,  0])
In [40]:
# predictions
def codes_to_frame(codes, column):
    """Decode integer class codes back to label strings via the fitted
    LabelEncoder `lb`, wrapped in a one-column DataFrame."""
    labels = lb.inverse_transform(codes.astype(int).flatten())
    return pd.DataFrame({column: labels})

preds = codes_to_frame(preds, 'predictedvalues')

# Actual labels (undo the one-hot encoding first)
actual = codes_to_frame(y_test.argmax(axis=1), 'actualvalues')

# Combine both into a single dataframe for side-by-side inspection
finaldf = actual.join(preds)
finaldf[170:180]
Out[40]:
actualvalues predictedvalues
170 male_sad male_sad
171 female_happy female_happy
172 male_angry male_angry
173 female_disgust female_disgust
174 male_angry male_angry
175 female_fear female_happy
176 male_angry male_happy
177 female_fear male_sad
178 female_happy male_neutral
179 female_neutral female_neutral
In [41]:
# Write out the predictions to disk (re-read by the analysis cells below)
finaldf.to_csv('Predictions.csv', index=False)
# Per-class prediction counts; note the heavy skew toward male_sad (460)
finaldf.groupby('predictedvalues').count()
Out[41]:
actualvalues
predictedvalues
female_angry 324
female_disgust 359
female_fear 225
female_happy 329
female_neutral 251
female_sad 188
female_surprise 103
male_angry 161
male_disgust 118
male_fear 89
male_happy 164
male_neutral 238
male_sad 460
male_surprise 32

Emotion vs Gender Accuracy

In [42]:
# the confusion matrix heat map plot
def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14):
    """Plots a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.

    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The numpy.ndarray object returned from a call to sklearn.metrics.confusion_matrix.
        Similarly constructed ndarrays can also be used.
    class_names: list
        An ordered list of class names, in the order they index the given confusion matrix.
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the outputted figure,
        the second determining the vertical size. Defaults to (10,7).
    fontsize: int
        Font size for axes labels. Defaults to 14.

    Returns
    -------
    matplotlib.figure.Figure
        The resulting confusion matrix figure
    """
    # Label both axes of the matrix with the class names for the heatmap.
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names, 
    )
    fig = plt.figure(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError:
        # fmt="d" requires integer cell values
        raise ValueError("Confusion matrix values must be integers.")

    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Fix: the docstring promised the Figure, but the original returned None.
    return fig

# Gender recode function
def gender(row):
    """Collapse a 'gender_emotion' label (e.g. 'male_sad') to just the gender.

    Bug fix: the original used `row == 'a' or 'b' or ...`, which Python parses
    as `(row == 'a') or 'b' or ...` -- always truthy -- so EVERY label
    (including all male ones) was recoded as 'female'. Prefix matching fixes
    that, and also covers 'female_angry', which the old chain omitted.
    Returns None for labels with neither prefix, matching the original's
    implicit fall-through.
    """
    if row.startswith('female_'):
        return 'female'
    elif row.startswith('male_'):
        return 'male'
In [43]:
# Get the predictions file written earlier
finaldf = pd.read_csv("Predictions.csv")
class_labels = sorted(finaldf.actualvalues.unique())

# Confusion matrix over all 14 gender_emotion classes
cm = confusion_matrix(finaldf.actualvalues, finaldf.predictedvalues)
print(accuracy_score(finaldf.actualvalues, finaldf.predictedvalues))
print_confusion_matrix(cm, class_names=class_labels)
0.4462347911871095
In [44]:
# Classification report over all 14 gender_emotion classes
class_labels = sorted(finaldf.actualvalues.unique())
print(classification_report(finaldf.actualvalues, finaldf.predictedvalues, target_names=class_labels))
                 precision    recall  f1-score   support

   female_angry       0.54      0.62      0.58       284
 female_disgust       0.42      0.56      0.48       272
    female_fear       0.52      0.42      0.46       275
   female_happy       0.46      0.50      0.48       299
 female_neutral       0.49      0.53      0.51       233
     female_sad       0.68      0.47      0.56       273
female_surprise       0.91      0.73      0.81       129
     male_angry       0.67      0.52      0.59       206
   male_disgust       0.35      0.21      0.26       197
      male_fear       0.29      0.13      0.18       193
     male_happy       0.32      0.26      0.28       201
   male_neutral       0.33      0.37      0.35       211
       male_sad       0.21      0.44      0.28       221
  male_surprise       0.50      0.34      0.41        47

       accuracy                           0.45      3041
      macro avg       0.48      0.44      0.45      3041
   weighted avg       0.47      0.45      0.45      3041

Gender Accuracy

In [45]:
# Collapse the 14 gender_emotion labels down to gender only.
# Fix: the original did `modidf = finaldf`, which ALIASES the frame rather
# than copying it, so the column reassignments below silently mutated
# finaldf as well (stale views in earlier cells). Work on a copy instead.
modidf = finaldf.copy()

# Build the same 14-entry label -> gender mapping the original spelled out
# by hand (7 emotions x 2 genders).
emotions = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
gender_map = {}
for g in ('female', 'male'):
    for e in emotions:
        gender_map[g + '_' + e] = g

modidf['actualvalues'] = modidf.actualvalues.replace(gender_map)
modidf['predictedvalues'] = modidf.predictedvalues.replace(gender_map)

classes = modidf.actualvalues.unique()  
classes.sort() 

# Confusion matrix 
c = confusion_matrix(modidf.actualvalues, modidf.predictedvalues)
print(accuracy_score(modidf.actualvalues, modidf.predictedvalues))
print_confusion_matrix(c, class_names = classes)
0.7974350542584676
In [46]:
# Classification report for the two gender classes
class_labels = sorted(modidf.actualvalues.unique())
print(classification_report(modidf.actualvalues, modidf.predictedvalues, target_names=class_labels))
              precision    recall  f1-score   support

      female       0.82      0.83      0.83      1765
        male       0.76      0.75      0.76      1276

    accuracy                           0.80      3041
   macro avg       0.79      0.79      0.79      3041
weighted avg       0.80      0.80      0.80      3041

Emotion Accuracy

In [47]:
# Emotion-only view: strip the gender prefix from each label.
# Re-read the predictions from disk so this cell is independent of the
# gender-recoded frame above.
modidf = pd.read_csv("Predictions.csv")

emotion_names = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
# Same 14-entry mapping the original wrote out by hand, built programmatically.
emotion_map = {prefix + '_' + emo: emo
               for prefix in ('female', 'male')
               for emo in emotion_names}

modidf['actualvalues'] = modidf.actualvalues.replace(emotion_map)
modidf['predictedvalues'] = modidf.predictedvalues.replace(emotion_map)

classes = modidf.actualvalues.unique() 
classes.sort() 

# Confusion matrix 
c = confusion_matrix(modidf.actualvalues, modidf.predictedvalues)
print(accuracy_score(modidf.actualvalues, modidf.predictedvalues))
print_confusion_matrix(c, class_names = classes)
0.4995067412035515
In [48]:
# Classification report for the seven emotion classes
class_labels = sorted(modidf.actualvalues.unique())
print(classification_report(modidf.actualvalues, modidf.predictedvalues, target_names=class_labels))
              precision    recall  f1-score   support

       angry       0.64      0.63      0.63       490
     disgust       0.45      0.45      0.45       469
        fear       0.50      0.33      0.40       468
       happy       0.45      0.45      0.45       500
     neutral       0.46      0.50      0.48       444
         sad       0.43      0.57      0.49       494
    surprise       0.83      0.64      0.72       176

    accuracy                           0.50      3041
   macro avg       0.54      0.51      0.52      3041
weighted avg       0.51      0.50      0.50      3041

Testing the Model on External Audio

In [49]:
from keras.models import Sequential, Model, model_from_json
import matplotlib.pyplot as plt
import keras 
import pickle
import wave  # !pip install wave
import os
import pandas as pd
import numpy as np
import sys
import warnings
import librosa
import librosa.display
import IPython.display as ipd  # To play sound in the notebook

# ignore warnings 
if not sys.warnoptions:
    warnings.simplefilter("ignore")
In [50]:
# Load the first external test clip (SAVEE 'disgust' sample, per the DC_d
# filename convention) and embed an audio player in the notebook.
data, sampling_rate = librosa.load('/kaggle/input/externaltest/DC_d02.wav')
ipd.Audio('/kaggle/input/externaltest/DC_d02.wav')
Out[50]:
In [51]:
# Visualise the clip's waveform.
# NOTE(review): librosa.display.waveplot was deprecated and removed in
# librosa 0.10 (replaced by waveshow) — this cell assumes the older
# librosa pinned in the Kaggle image; confirm before upgrading.
plt.figure(figsize=(15, 5))
librosa.display.waveplot(data, sr=sampling_rate)
Out[51]:
<matplotlib.collections.PolyCollection at 0x7ff9506dc1d0>
In [52]:
# Rebuild the trained CNN: architecture from JSON, weights from HDF5.
# A `with` block guarantees the JSON file is closed even if reading raises
# (the original open/read/close sequence leaked the handle on error).
with open('model_json.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights("saved_models/Emotion_Model.h5")
print("Loaded model from disk")

# Keras optimiser — lowercase `rmsprop` is the legacy Keras (<2.3) alias;
# newer Keras spells it keras.optimizers.RMSprop(learning_rate=...).
opt = keras.optimizers.rmsprop(lr=0.00001, decay=1e-6)
loaded_model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
Loaded model from disk
In [53]:
# Transform the clip into the feature vector the model was trained on:
# 2.5 s of audio at 44.1 kHz, skipping the first 0.5 s, then 13 MFCCs.
X, sample_rate = librosa.load('/kaggle/input/externaltest/DC_d02.wav'
                              ,res_type='kaiser_fast'
                              ,duration=2.5
                              ,sr=44100
                              ,offset=0.5
                             )

sample_rate = np.array(sample_rate)
# NOTE(review): axis=0 averages across the 13 MFCC coefficients per frame,
# leaving one value per time frame (216 columns, matching the model's input
# width) — this is the transpose of the more common per-coefficient mean.
mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13),axis=0)
newdf = pd.DataFrame(data=mfccs).T
newdf
Out[53]:
0 1 2 3 4 5 6 7 8 9 ... 206 207 208 209 210 211 212 213 214 215
0 -25.927307 -25.113689 -25.729801 -27.127584 -25.885874 -24.155733 -25.008343 -25.994907 -26.17831 -26.852377 ... -10.183812 -10.571394 -11.709736 -12.798611 -13.809805 -12.855606 -10.632426 -8.444742 -5.378166 -2.130258

1 rows × 216 columns

Predictions

In [54]:
# Add the trailing channel dimension the Conv1D model expects:
# (1, 216) -> (1, 216, 1), then run inference on the single sample.
newdf = np.asarray(newdf)[:, :, np.newaxis]
newpred = loaded_model.predict(newdf, batch_size=16, verbose=1)

newpred
1/1 [==============================] - 0s 111ms/step
Out[54]:
array([[9.0809428e-16, 1.0009815e-06, 4.8491597e-12, 2.6200311e-11,
        4.4584823e-14, 8.3644489e-12, 6.3604724e-01, 7.8486807e-17,
        1.3142330e-02, 6.2030228e-04, 3.4374709e-10, 3.1237639e-06,
        3.5018596e-01, 2.7444181e-11]], dtype=float32)
In [55]:
# Restore the fitted label encoder saved at training time so the 14-way
# softmax index can be mapped back to a label string.
# NOTE(review): pickle.load executes arbitrary code — only load trusted files.
# The `with` block closes the file even if unpickling raises (the original
# open/load/close sequence leaked the handle on error).
filename = '/kaggle/input/labels/labels'
with open(filename, 'rb') as infile:
    lb = pickle.load(infile)

# Get the final predicted label
final = newpred.argmax(axis=1)
final = final.astype(int).flatten()
final = lb.inverse_transform(final)
print(final) #emo(final) #gender(final) 
['female_surprise']

Second Test

In [56]:
# Load the second external test clip (another SAVEE 'disgust' sample)
# and embed an audio player in the notebook.
data, sampling_rate = librosa.load('/kaggle/input/externaltest/DC_d06.wav')
ipd.Audio('/kaggle/input/externaltest/DC_d06.wav')
Out[56]:
In [57]:
# Visualise the second clip's waveform.
# NOTE(review): librosa.display.waveplot was deprecated and removed in
# librosa 0.10 (replaced by waveshow) — confirm the pinned librosa version.
plt.figure(figsize=(15, 5))
librosa.display.waveplot(data, sr=sampling_rate)
Out[57]:
<matplotlib.collections.PolyCollection at 0x7ff9502e7a20>
In [58]:
# Rebuild the trained CNN again (this cell duplicates the earlier load so the
# second test is self-contained). A `with` block guarantees the JSON file is
# closed even if reading raises (the original sequence leaked the handle on error).
with open('model_json.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights("saved_models/Emotion_Model.h5")
print("Loaded model from disk")

# Keras optimiser — lowercase `rmsprop` is the legacy Keras (<2.3) alias;
# newer Keras spells it keras.optimizers.RMSprop(learning_rate=...).
opt = keras.optimizers.rmsprop(lr=0.00001, decay=1e-6)
loaded_model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
Loaded model from disk
In [59]:
# Transform the second clip into the model's feature vector: 2.5 s of audio
# at 44.1 kHz, skipping the first 0.5 s, then 13 MFCCs.
X, sample_rate = librosa.load('/kaggle/input/externaltest/DC_d06.wav'
                              ,res_type='kaiser_fast'
                              ,duration=2.5
                              ,sr=44100
                              ,offset=0.5
                             )

sample_rate = np.array(sample_rate)
# NOTE(review): axis=0 averages across the 13 MFCC coefficients per frame,
# leaving one value per time frame (216 columns, matching the model's input
# width) — the transpose of the more common per-coefficient mean.
mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13),axis=0)
newdf = pd.DataFrame(data=mfccs).T
newdf
Out[59]:
0 1 2 3 4 5 6 7 8 9 ... 206 207 208 209 210 211 212 213 214 215
0 -17.685799 -10.543396 -7.518021 -7.506943 -7.403336 -7.362275 -8.112684 -6.900737 -7.539662 -7.677541 ... -25.768373 -24.080482 -23.160265 -23.539013 -23.78928 -25.096966 -24.103725 -22.289076 -23.330709 -23.340109

1 rows × 216 columns

In [60]:
# Add the trailing channel dimension the Conv1D model expects:
# (1, 216) -> (1, 216, 1), then run inference on the single sample.
newdf = np.asarray(newdf)[:, :, np.newaxis]
newpred = loaded_model.predict(newdf, batch_size=16, verbose=1)

newpred
1/1 [==============================] - 0s 111ms/step
Out[60]:
array([[1.3512749e-37, 3.2906085e-26, 2.6094519e-31, 7.0773645e-30,
        3.9505280e-20, 3.0474707e-21, 5.1564395e-17, 1.8682977e-28,
        1.0000000e+00, 1.0790410e-15, 9.1852875e-20, 9.9736712e-13,
        2.5039565e-10, 4.2832546e-20]], dtype=float32)
In [61]:
# Restore the fitted label encoder to decode the second prediction.
# NOTE(review): pickle.load executes arbitrary code — only load trusted files.
# The `with` block closes the file even if unpickling raises (the original
# open/load/close sequence leaked the handle on error).
filename = '/kaggle/input/labels/labels'
with open(filename, 'rb') as infile:
    lb = pickle.load(infile)

# Get the final predicted label
final = newpred.argmax(axis=1)
final = final.astype(int).flatten()
final = lb.inverse_transform(final)
print(final) #emo(final) #gender(final) 
['male_disgust']